* This Stata dofile is written to accompany the papers:
* A Leigh & P van der Eng, 'Inequality in Indonesia: What Can We Learn from Top Incomes?' (2009), Journal of Public Economics, 93(1-2): 209-212
* A Leigh & P van der Eng, 'Top Incomes in Indonesia 1920-2004' in A.B. Atkinson and T. Piketty (eds) (2009) Top Incomes Over the Twentieth Century: Volume II - A Global Perspective, Oxford, Oxford University Press
* It follows an Excel routine devised by Tony Atkinson.
* Feel free to use or adapt it, but please cite those papers.
* Questions to andrew_leigh@ksg02.harvard.edu

* Reset the session before running: turn off output paging, then clear
* the data in memory, any open postfiles and any previously defined programs
set more off
clear
postutil clear
program drop _all
* Working directory; the data files used below are opened relative to it
cd "C:\Users\Andrew\My publications\Indonesian Top Incomes\"

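* Input data files (each named after the first year it covers) and the
* year-suffixed variables each one holds alongside band:
  * 1920 - persons
  * 1921 - persons 
  * 1930, 1933 - persons, income
  * 1934, 1938 - persons, income, wages 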
* What's the right assumption about the top band income (for 1920-29)?
* Diagnostic: for every year that has income data, display the ratio of mean
* income in the highest non-empty band to that band's lower limit.
*   - Empty bands are dropped first; the rows are then reversed so that
*     observation 1 is the last remaining row (assumes each file is stored in
*     ascending band order - confirm).
*   - Income is multiplied by 1000 for 1930-34, mirroring the rescaling that
*     topinc1 applies to the same years.
*   - 1932 is included: the main run reads year 1932 from file 1930 as well,
*     so the check now covers all three years held in that file.
for num 1930/1932: use 1930, clear \ drop if personsX==0 \ gen n=_n \ gsort -n \ gen ratio=1000*incomeX/personsX/band in 1 \ di ratio
for num 1933/1934: use 1933, clear \ drop if personsX==0 \ gen n=_n \ gsort -n \ gen ratio=1000*incomeX/personsX/band in 1 \ di ratio
for num 1935/1937: use 1934, clear \ drop if personsX==0 \ gen n=_n \ gsort -n \ gen ratio=incomeX/personsX/band in 1 \ di ratio
for num 1938/1939: use 1938, clear \ drop if personsX==0 \ gen n=_n \ gsort -n \ gen ratio=incomeX/personsX/band in 1 \ di ratio

****************************************
* topinc program 
****************************************

* topinc1: estimate top income shares and cutoffs for ONE year of banded
* tax data and post a single row of results to the open postfile _1.
*
* Expected data in memory (one observation per income band, one year only):
*   band            lower limit of the income band
*   persons<year>   number of taxpayers in the band
*   income<year>    band income (only used when year>=1930)
*   year            the year, constant across observations
*   households, personalincome   control totals merged in by the caller
* The year-suffixed names are reached through Stata's variable abbreviation
* (persons -> persons<year>, income -> income<year>), so the program needs
* varabbrev to be on and exactly one variable beginning with each stem.
*
* Thresholds: the suffixes 10 5 1 05 01 005 001 stand for the top
* 10, 5, 1, 0.5, 0.1, 0.05 and 0.01 percent of the population.
*
* Side effects: renames and adds variables, collapses the data to a single
* observation, and requires postfile _1 to have been opened by the caller
* with the 28 columns listed in the post statement at the end.
program define topinc1
 ren households population

 * n = rank by ascending band, rn = rank by descending band (rn==1 is the
 * top band).  N (number of bands) is generated but not used further.
 sort band
 gen n=_n
 gen N=_N
 gsort -n
 gen rn=_n
 * With rn as the time variable, l. is the next HIGHER band and f. the next
 * LOWER band.
 tsset rn

 * Not a no-op: the abbreviation persons resolves to persons<year>, which is
 * thereby renamed to plain persons.
 ren persons persons

 * The if command (not the if qualifier) tests year in the first observation;
 * year is constant within the file.  Income for 1930-34 is scaled by 1000.
 if (year>=1930 & year<=1934) {
  replace income=income*1000
 }
 if year>=1930 {
  * Same abbreviation trick as for persons: income<year> becomes income
  ren income income
 }
 else {
  * No income data before 1930: impute band income as persons times the
  * midpoint between this band's lower limit and the next higher band's.
  gen income=persons*(band+l.band)*0.5
  * Top band has no upper limit: use its lower limit plus half the width
  * of the band below it.
  replace income=persons*(band+((band-f.band)*0.5)) if rn==1
  *ASSUMING THAT THOSE IN TOP BAND HAVE AVERAGE INCOME 1.4 TIMES LOWER LIMIT: replace income=persons*band*1.4 if rn==1
  *Lower bound
  *gen income=persons*band
  *Upper bound
  *gen income=persons*l.band
  *replace income=persons*(band+((band-f.band)*0.5)) if rn==1
 }

 * Spread the control totals to every observation
 for any population personalincome: egen temp=max(X) \ replace X=temp \ drop temp
 gen meanincome=(personalincome)/population

 * Band shares of population and of personal income (in percent), and their
 * running totals accumulated from the top band downward
 gen pctfreq=(persons/population)*100
 gen cumpctfreq=pctfreq if rn==1
 replace cumpctfreq=pctfreq+l.cumpctfreq if rn>1
 gen totalincome=(income/personalincome)*100
 gen cumtotalincome=totalincome if rn==1
 replace cumtotalincome=totalincome+l.cumtotalincome if rn>1

 * Band limits and band means expressed relative to overall mean income.
 * The descriptions are attached as variable labels; the name:label slot of
 * generate is reserved for value-label names, so label variable is used.
 gen lowerlim=band/meanincome
 label variable lowerlim "Lower limit relative to mean"
 gen cellmean=(totalincome/pctfreq)*meanincome
 gen cellmeanrel=cellmean/meanincome
 label variable cellmeanrel "Cell mean relative to mean"

 * Switch to ascending order: from here on f. is the next HIGHER band.
 * The n>1 conditions leave the lowest band out of these calculations.
 tsset n
 gen midptmean=(lowerlim+f.lowerlim)/2-cellmeanrel if n>1
 gen alphacalc=ln(cumpctfreq/f.cumpctfreq)/ln(f.lowerlim/lowerlim) if n>1
 label variable alphacalc "Alpha calc from F"
 gen hupper=(cumpctfreq*2*(cellmeanrel-lowerlim)+f.cumpctfreq*(lowerlim+f.lowerlim-2*cellmeanrel))/(f.lowerlim-lowerlim) if n>1
 gen hmeansplit=(cumpctfreq*(cellmeanrel-lowerlim)+f.cumpctfreq*(f.lowerlim-cellmeanrel))/(f.lowerlim-lowerlim) if n>1

 * In every loop below X is the threshold suffix and Y its numeric value.
 * Results are filled in only for the band that brackets the threshold:
 * cumpctfreq>Y here and cumpctfreq<=Y in the next higher band.  The top band
 * never qualifies because its f.cumpctfreq is missing.

 * LINEAR BOUNDS - LOWER
 * Cumulative income share down to this band, less the population in excess
 * of Y valued at the band's relative mean
 for any 10 5 1 05 01 005 001 \ num 10 5 1 .5 .1 .05 .01 : gen lowerX=cumtotalinc+cellmeanrel*(Y-cumpctfreq) if cumpctfreq>Y & f.cumpctfreq<=Y

 * LINEAR BOUNDS - UPPER
 * Smaller of two extrapolations: from this band's lower limit and from the
 * next higher band's lower limit
 for any  10 5 1 05 01 005 001 \ num 10 5 1 .5 .1 .05 .01 : gen upperX=min((cumtotalinc+lowerlim*(Y-cumpctfreq)),(f.cumtotalinc+f.lowerlim*(Y-f.cumpctfreq))) if cumpctfreq>Y & f.cumpctfreq<=Y

 * REFINED BOUNDS - LOWER RESTRICTED BOUND FOR SHARE
 gen lowerdensity=pctfreq/(2*(cellmeanrel-lowerlim)) if n>1
 for any 10 5 1 05 01 005 001 \ num 10 5 1 .5 .1 .05 .01: gen lboundX=cumtotalinc-(cumpctfreq-Y)*(lowerlim+(cumpctfreq-Y)/(2*lowerdensity)) if lowerX~=.

 * REFINED BOUNDS - UPPER RESTRICTED BOUND FOR SHARE
 * Two formulas, chosen by whether Y lies above or at/below hupper
 gen upperdensity= pctfreq*2*(cellmeanrel-lowerlim)/((f.lowerlim-lowerlim)^2) if n>1
 for any  10 5 1 05 01 005 001 \ num 10 5 1 .5 .1 .05 .01 : gen uboundX=cumtotalinc-(cumpctfreq-Y)*(lowerlim) if lowerX~=. & Y>hupper
 for any  10 5 1 05 01 005 001 \ num 10 5 1 .5 .1 .05 .01 : replace uboundX=f.cumtotalinc+(Y-f.cumpctfreq)*(f.lowerlim-(Y-f.cumpctfreq)/(2*upperdensity)) if lowerX~=. & Y<=hupper

 * MEAN SPLIT HISTOGRAM 
 * Two densities per band, one on each side of the band mean; msh_h and
 * msh_l below are abbreviations of msh_higherden and msh_lowerdensity.
 gen msh_higherden=pctfreq*((f.lowerlim-cellmeanrel)/(cellmeanrel-lowerlim))/(f.lowerlim-lowerlim) if n>1
 gen msh_lowerdensity=pctfreq*((cellmeanrel-lowerlim)/(f.lowerlim-lowerlim))/(f.lowerlim-cellmeanrel) if n>1
 for any  10 5 1 05 01 005 001 \ num 10 5 1 .5 .1 .05 .01 : gen mshX=cumtotalinc-(cumpctfreq-Y)*(lowerlim+(cumpctfreq-Y)/(2*msh_h)) if lowerX~=. & Y>hmeansplit
 for any  10 5 1 05 01 005 001 \ num 10 5 1 .5 .1 .05 .01 : replace mshX=f.cumtotalinc+(Y-f.cumpctfreq)*(f.lowerlim-(Y-f.cumpctfreq)/(2*msh_l)) if lowerX~=. & Y<=hmeansplit

 * CALCULATING BAND CUTOFFS
 * index is +1 or -1; multiplying inside and outside max() turns it into a
 * max of the two candidates when index is 1 and a min when index is -1.
 gen index=1 if msh_higher>=msh_lower
 recode index .=-1
 for any 10 5 1 05 01 005 001 \ num 10 5 1 .5 .1 .05 .01 : gen cutoffX=meanincome*index*max((index*(lowerlim+(cumpctfreq-Y)/msh_higherden)),(index*(f.lowerlim-(Y-f.cumpctfreq)/msh_lower))) if lowerX~=.

 * COLLAPSING AND POSTING RESULTS
 * The reported share is the mean-split-histogram estimate
 for any  10 5 1 05 01 005 001: gen shareX=mshX
 *collapse (max) year (max) share* (max) cutoff*
 *post _1 (year) (share10) (share5) (share1) (share05) (share01) (share005) (share001) (cutoff10) (cutoff5) (cutoff1) (cutoff05) (cutoff01) (cutoff005) (cutoff001)
 * Totals over the bands present in the tax data
 egen taxpayers=sum(persons) 
 egen taxableincome=sum(income)
 gen meanincome_taxpayers=taxableincome/taxpayers
 * Per-threshold variables are non-missing only in the bracketing band, so
 * taking the max over bands reduces the data to one row for the year.
 collapse (max) year (max) share* (max) lower* (max) population (max) taxpayers (max) cutoff* (max) meanincome meanincome_taxpayers (max) personalincome (max) taxableincome
 post _1 (year) (share10) (share5) (share1) (share05) (share01) (share005) (share001) (cutoff10) (cutoff5) (cutoff1) (cutoff05) (cutoff01) (cutoff005) (cutoff001) (lower10) (lower5) (lower1) (lower05) (lower01) (lower005) (lower001) (population) (taxpayers) (meanincome) (meanincome_taxpayers) (personalincome) (taxableincome)
end

***************************************************************
* This next section runs the above program, and saves the results in shares-ext.dta
***************************************************************

cd "C:\Users\Andrew\My publications\Indonesian Top Incomes\"

* Columns of each results file; must match the post statement in topinc1
local resultvars year share10 share5 share1 share05 share01 share005 share001 cutoff10 cutoff5 cutoff1 cutoff05 cutoff01 cutoff005 cutoff001 lower10 lower5 lower1 lower05 lower01 lower005 lower001 population taxpayers meanincome meanincome_taxpayers personalincome taxableincome

* Estimating top income shares 1920-39
* srcfiles lists, year by year from 1920, the data file holding that year's
* columns.  For each year: keep band plus that year's variables, attach the
* control totals for the year, and let topinc1 post one row of results.
postfile _1 `resultvars' using shares-ext.dta, replace
local srcfiles 1920 1921 1921 1921 1921 1921 1921 1921 1921 1921 1930 1930 1930 1933 1933 1934 1934 1934 1938 1938
local yr 1920
foreach src of local srcfiles {
    use `src', clear
    keep band *`yr'
    gen year=`yr'
    sort year
    merge year using controltotals-1920, nokeep
    drop _merge
    topinc1
    local yr = `yr' + 1
}
postclose _1

* Estimating wage share of top incomes 1920-39
* Same procedure for 1935-39, but with that year's income column overwritten
* by its wages column before topinc1 runs; results go to temp.dta.
postfile _1 `resultvars' using temp.dta, replace
local srcfiles 1934 1934 1934 1938 1938
local yr 1935
foreach src of local srcfiles {
    use `src', clear
    keep band *`yr'
    gen year=`yr'
    sort year
    merge year using controltotals-1920, nokeep
    drop _merge
    replace income=wages
    topinc1
    local yr = `yr' + 1
}
postclose _1

* --- Clean the all-income estimates ---
use shares-ext.dta, clear
* Simple decision rule: if cutoff < 150% of mean income across society, drop the estimate
foreach p in 10 5 1 {
    replace share`p'=. if cutoff`p'<1.5*meanincome
}
* Sanity checks on the gaps between adjacent top shares (missing shares pass)
assert share5-share1>7.8
assert share10-share5>8
* Dropping non-credible estimates
replace share1=. if year==1920
sort year
save shares-ext.dta, replace

* --- Wage-based estimates: rename so they can sit beside the all-income ones ---
use temp.dta, clear
foreach p in 10 5 1 05 01 005 001 {
    ren lower`p' lower_salary`p'
    ren share`p' share_salary`p'
}

keep lower_salary* share_salary* year
sort year
merge year using shares-ext.dta
* Wage-based estimate as a fraction of the all-income estimate, for the
* share and for the linear lower bound; only the latter is kept and saved
foreach p in 10 5 1 05 01 005 001 {
    gen salary_pct`p'=share_salary`p'/share`p'
    gen salary_lb_pct`p'=lower_salary`p'/lower`p'
}
keep year salary_lb_pct*
sort year
save shares-salarypct.dta, replace

* How many bands are there?
* Summarize each raw data file in turn; the Obs column gives its row count
foreach f in 1920 1921 1930 1933 1934 1938 {
    use `f',clear
    sum
}

* Control Totals
* Build a table of control totals with taxpayers and mean income attached.
* The result is left in memory only (nothing is saved) and is replaced by
* the next use command.
use controltotals-1920, clear
*drop personalincome_incjava population_incjava population
* Keep control-total rows before 1939 (rows with a missing year go as well)
keep if year<1939
merge year using shares-ext, keep(taxpayers meanincome)
drop _merge
order year households taxpayers personalincome meanincome 
replace meanincome=int(meanincome)

use shares-ext.dta, clear

* Diagnostics
* Taxpayers as a fraction of population, taxable income as a fraction of
* personal income
gen taxpoppct=taxpayers/population
gen taxincpct=taxableincome/personalincome
* Year-on-year proportional change in the two income totals
tsset year
foreach v in taxableincome personalincome {
    gen d`v'=(`v'-l.`v')/l.`v'
}
* Truncate the means to whole numbers for listing
foreach v in meanincome meanincome_taxpayers {
    replace `v'=int(`v')
}
list year meanincome meanincome_taxpayers,clean noo

* Distribution of the share10-share5 and share5-share1 gaps
gen _9599=share5-share1
gen _9095=share10-share5
sum _9095 _9599,d
drop _9095 _9599

use shares-salarypct.dta, clear
